In [1]:
import os
import pandas as pd
import numpy as np
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import warnings
warnings.filterwarnings('ignore')
from lightgbm import LGBMRegressor
import joblib
In [2]:
import numpy as np                   # array, vector, matrix calculations
import pandas as pd                  # DataFrame handling
import shap                          # for consistent, signed variable importance measurements
import xgboost as xgb                # gradient boosting machines (GBMs)

import matplotlib.pyplot as plt      # plotting
pd.options.display.max_columns = 999 # enable display of all columns in notebook

# enables display of plots in notebook
%matplotlib inline

np.random.seed(12345)    

Data Wrangling

In [3]:
sales = pd.read_csv('sales_train_evaluation.csv')
sales.name = 'sales'
calendar = pd.read_csv('calendar.csv')
calendar.name = 'calendar'
prices = pd.read_csv('sell_prices.csv')
prices.name = 'prices'
In [4]:
for d in range(1942,1970):
    col = 'd_' + str(d)
    sales[col] = 0
    sales[col] = sales[col].astype(np.int16)  # add sales columns d_1942 to d_1969 (the 28-day forecast horizon), initialized to 0
In [5]:
sales_bd = np.round(sales.memory_usage().sum()/(1024*1024),1) #calculate the total memory used before downcast
calendar_bd = np.round(calendar.memory_usage().sum()/(1024*1024),1)
prices_bd = np.round(prices.memory_usage().sum()/(1024*1024),1)
In [6]:
#Downcast in order to save memory
def downcast(df):
    cols = df.dtypes.index.tolist()
    types = df.dtypes.values.tolist()
    for i,t in enumerate(types):
        if 'int' in str(t):
            if df[cols[i]].min() > np.iinfo(np.int8).min and df[cols[i]].max() < np.iinfo(np.int8).max:
                df[cols[i]] = df[cols[i]].astype(np.int8)
            elif df[cols[i]].min() > np.iinfo(np.int16).min and df[cols[i]].max() < np.iinfo(np.int16).max:
                df[cols[i]] = df[cols[i]].astype(np.int16)
            elif df[cols[i]].min() > np.iinfo(np.int32).min and df[cols[i]].max() < np.iinfo(np.int32).max:
                df[cols[i]] = df[cols[i]].astype(np.int32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.int64)
        elif 'float' in str(t):
            if df[cols[i]].min() > np.finfo(np.float16).min and df[cols[i]].max() < np.finfo(np.float16).max:
                df[cols[i]] = df[cols[i]].astype(np.float16)
            elif df[cols[i]].min() > np.finfo(np.float32).min and df[cols[i]].max() < np.finfo(np.float32).max:
                df[cols[i]] = df[cols[i]].astype(np.float32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.float64)
        elif t == object:  # np.object is deprecated; plain `object` matches string/object columns
            if cols[i] == 'date':
                df[cols[i]] = pd.to_datetime(df[cols[i]], format='%Y-%m-%d')
            else:
                df[cols[i]] = df[cols[i]].astype('category')
    return df  

sales = downcast(sales)
prices = downcast(prices)
calendar = downcast(calendar)
In [7]:
sales_ad = np.round(sales.memory_usage().sum()/(1024*1024),1) #calculate the total memory used after downcast
calendar_ad = np.round(calendar.memory_usage().sum()/(1024*1024),1)
prices_ad = np.round(prices.memory_usage().sum()/(1024*1024),1)
In [8]:
dic = {'DataFrame':['sales','calendar','prices'],    
       'Before downcasting':[sales_bd,calendar_bd,prices_bd],
       'After downcasting':[sales_ad,calendar_ad,prices_ad]}

memory = pd.DataFrame(dic)
memory = pd.melt(memory, id_vars='DataFrame', var_name='Status', value_name='Memory (MB)')
memory.sort_values('Memory (MB)',inplace=True)
fig = px.bar(memory, x='DataFrame', y='Memory (MB)', color='Status', barmode='group', text='Memory (MB)')
fig.update_traces(texttemplate='%{text} MB', textposition='outside')
fig.update_layout(template='seaborn', title='Effect of Downcasting')
fig.show()  #plot the effect of downcasting; the plot shows a substantial memory reduction for all three tables
In [9]:
df = pd.melt(sales, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name='d', value_name='sold').dropna()
In [10]:
df = pd.merge(df, calendar, on='d', how='left')
df = pd.merge(df, prices, on=['store_id','item_id','wm_yr_wk'], how='left') #merge the three tables together
In [11]:
#remove Christmas Day (12-25) of each year: stores are closed, so these days are outliers, and the test period doesn't include a Christmas Day
df = df[~df['date'].isin(pd.to_datetime(['2011-12-25','2012-12-25','2013-12-25','2014-12-25','2015-12-25']))]
In [12]:
group = sales.groupby(['state_id','store_id','cat_id','dept_id'],as_index=False)['item_id'].count().dropna()
group['USA'] = 'United States of America'
group.rename(columns={'state_id':'State','store_id':'Store','cat_id':'Category','dept_id':'Department','item_id':'Count'},inplace=True)
In [13]:
group_price_store = df.groupby(['state_id','store_id','item_id'],as_index=False)['sell_price'].mean().dropna()
In [14]:
group_price_cat = df.groupby(['store_id','cat_id','item_id'],as_index=False)['sell_price'].mean().dropna()
In [15]:
group = df.groupby(['year','date','state_id','store_id'], as_index=False)['sold'].sum().dropna() # aggregate daily sales per store and drop missing values
In [16]:
fig = go.Figure()
title = 'Items sold over time'
years = group.year.unique().tolist()
buttons = []
y=3
for state in group.state_id.unique().tolist():
    group_state = group[group['state_id']==state]
    for store in group_state.store_id.unique().tolist():
        group_state_store = group_state[group_state['store_id']==store]
        fig.add_trace(go.Scatter(name=store, x=group_state_store['date'], y=group_state_store['sold'], showlegend=True, 
                                   yaxis='y'+str(y) if y!=1 else 'y'))
    y-=1

fig.update_layout(
        xaxis=dict(
        #autorange=True,
        range = ['2011-01-29','2016-05-22'],
        rangeselector=dict(
            buttons=list([
                dict(count=1,
                     label="1m",
                     step="month",
                     stepmode="backward"),
                dict(count=6,
                     label="6m",
                     step="month",
                     stepmode="backward"),
                dict(count=1,
                     label="YTD",
                     step="year",
                     stepmode="todate"),
                dict(count=1,
                     label="1y",
                     step="year",
                     stepmode="backward"),
                dict(count=2,
                     label="2y",
                     step="year",
                     stepmode="backward"),
                dict(count=3,
                     label="3y",
                     step="year",
                     stepmode="backward"),
                dict(count=4,
                     label="4y",
                     step="year",
                     stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            autorange=True,
        ),
        type="date"
    ),
    yaxis=dict(
        anchor="x",
        autorange=True,
        domain=[0, 0.33],
        mirror=True,
        showline=True,
        side="left",
        tickfont={"size":10},
        tickmode="auto",
        ticks="",
        title='WI',
        titlefont={"size":20},
        type="linear",
        zeroline=False
    ),
    yaxis2=dict(
        anchor="x",
        autorange=True,
        domain=[0.33, 0.66],
        mirror=True,
        showline=True,
        side="left",
        tickfont={"size":10},
        tickmode="auto",
        ticks="",
        title = 'TX',
        titlefont={"size":20},
        type="linear",
        zeroline=False
    ),
    yaxis3=dict(
        anchor="x",
        autorange=True,
        domain=[0.66, 1],
        mirror=True,
        showline=True,
        side="left",
        tickfont={"size":10},
        tickmode="auto",
        ticks='',
        title="CA",
        titlefont={"size":20},
        type="linear",
        zeroline=False
    )
    )
fig.update_layout(template='seaborn', title=title)
fig.show() 
#This plot shows items sold over different periods; the trends are stable and don't change much over time, which suggests
#that date features may not have a great effect on the final result
In [17]:
df['revenue'] = df['sold']*df['sell_price'].astype(np.float32)  #create revenue column
In [18]:
def introduce_nulls(df):
    idx = pd.date_range(df.date.dt.date.min(), df.date.dt.date.max())
    df = df.set_index('date')
    df = df.reindex(idx)
    df.reset_index(inplace=True)
    df.rename(columns={'index':'date'},inplace=True)
    return df

def plot_metric(df,state,store,metric):
    store_sales = df[(df['state_id']==state)&(df['store_id']==store)&(df['date']<='2016-05-22')]
    food_sales = store_sales[store_sales['cat_id']=='FOODS']
    store_sales = store_sales.groupby(['date','snap_'+state],as_index=False)[['sold','revenue']].sum()
    snap_sales = store_sales[store_sales['snap_'+state]==1]
    non_snap_sales = store_sales[store_sales['snap_'+state]==0]
    food_sales = food_sales.groupby(['date','snap_'+state],as_index=False)[['sold','revenue']].sum()
    snap_foods = food_sales[food_sales['snap_'+state]==1]
    non_snap_foods = food_sales[food_sales['snap_'+state]==0]
    non_snap_sales = introduce_nulls(non_snap_sales)
    snap_sales = introduce_nulls(snap_sales)
    non_snap_foods = introduce_nulls(non_snap_foods)
    snap_foods = introduce_nulls(snap_foods)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=non_snap_sales['date'],y=non_snap_sales[metric],
                           name='Total '+metric+'(Non-SNAP)'))
    fig.add_trace(go.Scatter(x=snap_sales['date'],y=snap_sales[metric],
                           name='Total '+metric+'(SNAP)'))
    fig.add_trace(go.Scatter(x=non_snap_foods['date'],y=non_snap_foods[metric],
                           name='Food '+metric+'(Non-SNAP)'))
    fig.add_trace(go.Scatter(x=snap_foods['date'],y=snap_foods[metric],
                           name='Food '+metric+'(SNAP)'))
    fig.update_yaxes(title_text='Total items sold' if metric=='sold' else 'Total revenue($)')
    fig.update_layout(template='seaborn',title=store)
    fig.update_layout(
        xaxis=dict(
        #autorange=True,
        range = ['2011-01-29','2016-05-22'],
        rangeselector=dict(
            buttons=list([
                dict(count=1,
                     label="1m",
                     step="month",
                     stepmode="backward"),
                dict(count=6,
                     label="6m",
                     step="month",
                     stepmode="backward"),
                dict(count=1,
                     label="YTD",
                     step="year",
                     stepmode="todate"),
                dict(count=1,
                     label="1y",
                     step="year",
                     stepmode="backward"),
                dict(count=2,
                     label="2y",
                     step="year",
                     stepmode="backward"),
                dict(count=3,
                     label="3y",
                     step="year",
                     stepmode="backward"),
                dict(count=4,
                     label="4y",
                     step="year",
                     stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            autorange=True,
        ),
        type="date"
    ))
    return fig
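
The cells that call plot_metric are not included in this section; for reference, a call of the following form would render the SNAP vs. non-SNAP comparison for one store (the store/state/metric arguments below are illustrative, not taken from the original notebook):

# illustrative calls only -- any state/store/metric combination present in df works
plot_metric(df, 'CA', 'CA_1', 'sold').show()      # items sold, SNAP vs non-SNAP days, store CA_1
plot_metric(df, 'TX', 'TX_2', 'revenue').show()   # revenue for store TX_2
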
In [19]:
cal_data = group.copy()
cal_data = cal_data[cal_data.date <= '2016-05-22']
cal_data['week'] = cal_data.date.dt.weekofyear
cal_data['day_name'] = cal_data.date.dt.day_name()
In [20]:
def calmap(cal_data, state, store, scale):
    cal_data = cal_data[(cal_data['state_id']==state)&(cal_data['store_id']==store)]
    years = cal_data.year.unique().tolist()
    fig = make_subplots(rows=len(years),cols=1,shared_xaxes=True,vertical_spacing=0.005)
    r=1
    for year in years:
        data = cal_data[cal_data['year']==year]
        data = introduce_nulls(data)
        fig.add_trace(go.Heatmap(
            z=data.sold,
            x=data.week,
            y=data.day_name,
            hovertext=data.date.dt.date,
            coloraxis = "coloraxis",name=year,
        ),r,1)
        fig.update_yaxes(title_text=year,tickfont=dict(size=5),row = r,col = 1)
        r+=1
    fig.update_xaxes(range=[1,53],tickfont=dict(size=10), nticks=53)
    fig.update_layout(coloraxis = {'colorscale':scale})
    fig.update_layout(template='seaborn', title=store)
    return fig
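
Likewise, once cal_data is built (next cell in execution order above), the calendar heatmap can be drawn per store; an illustrative call, assuming any valid Plotly colorscale name such as 'Viridis' for the scale argument:

# illustrative call only -- `scale` is a Plotly colorscale name
calmap(cal_data, 'CA', 'CA_1', 'Viridis').show()
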
In [21]:
d_id = dict(zip(df.id.cat.codes, df.id))
d_item_id = dict(zip(df.item_id.cat.codes, df.item_id))
d_dept_id = dict(zip(df.dept_id.cat.codes, df.dept_id))
d_cat_id = dict(zip(df.cat_id.cat.codes, df.cat_id))
d_store_id = dict(zip(df.store_id.cat.codes, df.store_id))
d_state_id = dict(zip(df.state_id.cat.codes, df.state_id))
In [22]:
del group, group_price_cat, group_price_store, group_state, group_state_store, cal_data
gc.collect();

#2
df.d = df['d'].apply(lambda x: x.split('_')[1]).astype(np.int16)
cols = df.dtypes.index.tolist()
types = df.dtypes.values.tolist()
for i,t in enumerate(types):
    if t.name == 'category':
        df[cols[i]] = df[cols[i]].cat.codes
        
#3
df.drop('date',axis=1,inplace=True)
In [23]:
lags = [1,2,3,6,12,24,36]
for lag in lags:
    df['sold_lag_'+str(lag)] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],as_index=False)['sold'].shift(lag).astype(np.float16)
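
As a quick sanity check (not part of the original notebook), the lag columns can be inspected for a single series to confirm that each lag equals the sales value from that many days earlier:

# sanity check (illustrative): print a few rows of one series together with its lag features
one_series = df[df['id'] == df['id'].iloc[0]].sort_values('d')
print(one_series[['d', 'sold', 'sold_lag_1', 'sold_lag_6']].head(8))
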
In [24]:
df['iteam_sold_avg'] = df.groupby('item_id')['sold'].transform('mean').astype(np.float16)
df['state_sold_avg'] = df.groupby('state_id')['sold'].transform('mean').astype(np.float16)
df['store_sold_avg'] = df.groupby('store_id')['sold'].transform('mean').astype(np.float16)
df['cat_sold_avg'] = df.groupby('cat_id')['sold'].transform('mean').astype(np.float16)
df['dept_sold_avg'] = df.groupby('dept_id')['sold'].transform('mean').astype(np.float16)
df['cat_dept_sold_avg'] = df.groupby(['cat_id','dept_id'])['sold'].transform('mean').astype(np.float16)
df['store_item_sold_avg'] = df.groupby(['store_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['cat_item_sold_avg'] = df.groupby(['cat_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['dept_item_sold_avg'] = df.groupby(['dept_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['state_store_sold_avg'] = df.groupby(['state_id','store_id'])['sold'].transform('mean').astype(np.float16)
df['state_store_cat_sold_avg'] = df.groupby(['state_id','store_id','cat_id'])['sold'].transform('mean').astype(np.float16)
df['store_cat_dept_sold_avg'] = df.groupby(['store_id','cat_id','dept_id'])['sold'].transform('mean').astype(np.float16)
In [25]:
df['rolling_sold_mean'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform(lambda x: x.rolling(window=7).mean()).astype(np.float16)
In [26]:
df['expanding_sold_mean'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform(lambda x: x.expanding(2).mean()).astype(np.float16)
In [27]:
df['daily_avg_sold'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id','d'])['sold'].transform('mean').astype(np.float16)
df['avg_sold'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform('mean').astype(np.float16)
df['selling_trend'] = (df['daily_avg_sold'] - df['avg_sold']).astype(np.float16)
df.drop(['daily_avg_sold','avg_sold'],axis=1,inplace=True)
In [28]:
df = df[df['d']>=36]
In [29]:
df  #final dataset used to build the model
Out[29]:
id item_id dept_id cat_id store_id state_id d sold wm_yr_wk weekday wday month year event_name_1 event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI sell_price revenue sold_lag_1 sold_lag_2 sold_lag_3 sold_lag_6 sold_lag_12 sold_lag_24 sold_lag_36 iteam_sold_avg state_sold_avg store_sold_avg cat_sold_avg dept_sold_avg cat_dept_sold_avg store_item_sold_avg cat_item_sold_avg dept_item_sold_avg state_store_sold_avg state_store_cat_sold_avg store_cat_dept_sold_avg rolling_sold_mean expanding_sold_mean selling_trend
1067150 14370 1437 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.217041 1.218750 1.307617 0.562500 0.697754 0.697754 0.322266 0.217041 0.217041 1.307617 0.803711 1.022461 0.0 0.000000 -0.322266
1067151 14380 1438 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.260498 1.218750 1.307617 0.562500 0.697754 0.697754 0.254639 0.260498 0.260498 1.307617 0.803711 1.022461 0.0 0.000000 -0.254639
1067152 14390 1439 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.076904 1.218750 1.307617 0.562500 0.697754 0.697754 0.157349 0.076904 0.076904 1.307617 0.803711 1.022461 0.0 0.000000 -0.157349
1067153 14400 1440 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 4.339844 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 2.017578 1.218750 1.307617 0.562500 0.697754 0.697754 1.699219 2.017578 2.017578 1.307617 0.803711 1.022461 0.0 0.000000 -1.699219
1067154 14410 1441 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.757324 1.218750 1.307617 0.562500 0.697754 0.697754 0.961426 0.757324 0.757324 1.307617 0.803711 1.022461 0.0 0.000000 -0.961426
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
60034805 14329 1432 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 2.980469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.799805 1.030273 1.092773 1.626953 2.037109 2.037109 0.529297 0.799805 0.799805 1.092773 1.706055 2.214844 0.0 0.529297 -0.529297
60034806 14339 1433 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 2.480469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.431396 1.030273 1.092773 1.626953 2.037109 2.037109 0.370605 0.431396 0.431396 1.092773 1.706055 2.214844 0.0 0.370605 -0.370605
60034807 14349 1434 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 3.980469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.706055 1.030273 1.092773 1.626953 2.037109 2.037109 0.882812 0.706055 0.706055 1.092773 1.706055 2.214844 0.0 0.882812 -0.882812
60034808 14359 1435 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 1.280273 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.638672 1.030273 1.092773 1.626953 2.037109 2.037109 0.376221 0.638672 0.638672 1.092773 1.706055 2.214844 0.0 0.376221 -0.376221
60034809 14369 1436 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 1.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.618164 1.030273 1.092773 1.626953 2.037109 2.037109 0.694336 0.618164 0.618164 1.092773 1.706055 2.214844 0.0 0.694336 -0.694336

58815210 rows × 44 columns

In [30]:
df.to_pickle('data.pkl')  #save the engineered dataset to data.pkl
del df
gc.collect();
In [31]:
df = pd.read_pickle('data.pkl')
In [32]:
df
Out[32]:
id item_id dept_id cat_id store_id state_id d sold wm_yr_wk weekday wday month year event_name_1 event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI sell_price revenue sold_lag_1 sold_lag_2 sold_lag_3 sold_lag_6 sold_lag_12 sold_lag_24 sold_lag_36 iteam_sold_avg state_sold_avg store_sold_avg cat_sold_avg dept_sold_avg cat_dept_sold_avg store_item_sold_avg cat_item_sold_avg dept_item_sold_avg state_store_sold_avg state_store_cat_sold_avg store_cat_dept_sold_avg rolling_sold_mean expanding_sold_mean selling_trend
1067150 14370 1437 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.217041 1.218750 1.307617 0.562500 0.697754 0.697754 0.322266 0.217041 0.217041 1.307617 0.803711 1.022461 0.0 0.000000 -0.322266
1067151 14380 1438 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.260498 1.218750 1.307617 0.562500 0.697754 0.697754 0.254639 0.260498 0.260498 1.307617 0.803711 1.022461 0.0 0.000000 -0.254639
1067152 14390 1439 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.076904 1.218750 1.307617 0.562500 0.697754 0.697754 0.157349 0.076904 0.076904 1.307617 0.803711 1.022461 0.0 0.000000 -0.157349
1067153 14400 1440 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 4.339844 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 2.017578 1.218750 1.307617 0.562500 0.697754 0.697754 1.699219 2.017578 2.017578 1.307617 0.803711 1.022461 0.0 0.000000 -1.699219
1067154 14410 1441 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.757324 1.218750 1.307617 0.562500 0.697754 0.697754 0.961426 0.757324 0.757324 1.307617 0.803711 1.022461 0.0 0.000000 -0.961426
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
60034805 14329 1432 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 2.980469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.799805 1.030273 1.092773 1.626953 2.037109 2.037109 0.529297 0.799805 0.799805 1.092773 1.706055 2.214844 0.0 0.529297 -0.529297
60034806 14339 1433 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 2.480469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.431396 1.030273 1.092773 1.626953 2.037109 2.037109 0.370605 0.431396 0.431396 1.092773 1.706055 2.214844 0.0 0.370605 -0.370605
60034807 14349 1434 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 3.980469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.706055 1.030273 1.092773 1.626953 2.037109 2.037109 0.882812 0.706055 0.706055 1.092773 1.706055 2.214844 0.0 0.882812 -0.882812
60034808 14359 1435 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 1.280273 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.638672 1.030273 1.092773 1.626953 2.037109 2.037109 0.376221 0.638672 0.638672 1.092773 1.706055 2.214844 0.0 0.376221 -0.376221
60034809 14369 1436 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 1.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.618164 1.030273 1.092773 1.626953 2.037109 2.037109 0.694336 0.618164 0.618164 1.092773 1.706055 2.214844 0.0 0.694336 -0.694336

58815210 rows × 44 columns

Model Building

In [34]:
data = df
valid1 = data[(data['d']>=1914) & (data['d']<1942)][['id','d','sold']]
test1 = data[data['d']>=1942][['id','d','sold']]
eval_preds = test1['sold']
valid_preds = valid1['sold']  #hold out validation (d 1914-1941) and evaluation (d >= 1942) rows; 'sold' will be overwritten with per-store predictions
In [42]:
df
Out[42]:
id item_id dept_id cat_id store_id state_id d sold wm_yr_wk weekday wday month year event_name_1 event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI sell_price revenue sold_lag_1 sold_lag_2 sold_lag_3 sold_lag_6 sold_lag_12 sold_lag_24 sold_lag_36 iteam_sold_avg state_sold_avg store_sold_avg cat_sold_avg dept_sold_avg cat_dept_sold_avg store_item_sold_avg cat_item_sold_avg dept_item_sold_avg state_store_sold_avg state_store_cat_sold_avg store_cat_dept_sold_avg rolling_sold_mean expanding_sold_mean selling_trend
1067150 14370 1437 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.217041 1.21875 1.307617 0.562500 0.697754 0.697754 0.322266 0.217041 0.217041 1.307617 0.803711 1.022461 0.0 0.000000 -0.322266
1067151 14380 1438 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.260498 1.21875 1.307617 0.562500 0.697754 0.697754 0.254639 0.260498 0.260498 1.307617 0.803711 1.022461 0.0 0.000000 -0.254639
1067152 14390 1439 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.076904 1.21875 1.307617 0.562500 0.697754 0.697754 0.157349 0.076904 0.076904 1.307617 0.803711 1.022461 0.0 0.000000 -0.157349
1067153 14400 1440 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 4.339844 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 2.017578 1.21875 1.307617 0.562500 0.697754 0.697754 1.699219 2.017578 2.017578 1.307617 0.803711 1.022461 0.0 0.000000 -1.699219
1067154 14410 1441 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.757324 1.21875 1.307617 0.562500 0.697754 0.697754 0.961426 0.757324 0.757324 1.307617 0.803711 1.022461 0.0 0.000000 -0.961426
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
60007364 14320 1432 2 0 0 0 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 2.980469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.799805 1.21875 1.307617 1.626953 2.037109 2.037109 0.581055 0.799805 0.799805 1.307617 1.938477 2.470703 0.0 0.581055 -0.581055
60007365 14330 1433 2 0 0 0 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 2.480469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.431396 1.21875 1.307617 1.626953 2.037109 2.037109 0.709473 0.431396 0.431396 1.307617 1.938477 2.470703 0.0 0.709473 -0.709473
60007366 14340 1434 2 0 0 0 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 3.980469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.706055 1.21875 1.307617 1.626953 2.037109 2.037109 0.950195 0.706055 0.706055 1.307617 1.938477 2.470703 0.0 0.950195 -0.950195
60007367 14350 1435 2 0 0 0 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 1.280273 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.638672 1.21875 1.307617 1.626953 2.037109 2.037109 0.911621 0.638672 0.638672 1.307617 1.938477 2.470703 0.0 0.911621 -0.911621
60007368 14360 1436 2 0 0 0 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 1.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.618164 1.21875 1.307617 1.626953 2.037109 2.037109 0.857422 0.618164 0.618164 1.307617 1.938477 2.470703 0.0 0.857422 -0.857422

5881521 rows × 44 columns

In [43]:
y = 'sold'
X = [name for name in df.columns if name not in [y,'id']]
print('y =', y)
print('X =', X)  #define the target and the feature columns
y = sold
X = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'revenue', 'sold_lag_1', 'sold_lag_2', 'sold_lag_3', 'sold_lag_6', 'sold_lag_12', 'sold_lag_24', 'sold_lag_36', 'iteam_sold_avg', 'state_sold_avg', 'store_sold_avg', 'cat_sold_avg', 'dept_sold_avg', 'cat_dept_sold_avg', 'store_item_sold_avg', 'cat_item_sold_avg', 'dept_item_sold_avg', 'state_store_sold_avg', 'state_store_cat_sold_avg', 'store_cat_dept_sold_avg', 'rolling_sold_mean', 'expanding_sold_mean', 'selling_trend']
In [45]:
stores = sales.store_id.cat.codes.unique().tolist()  
#my computer doesn't have enough memory to train on the whole dataset at once, so I create 10 models,
#one per store, use each model to predict its own store, and then merge the final results.
for store in stores:
    df = data[data['store_id']==store]
    
    
    
    
    train = df[df['d']<1914]
    valid = df[(df['d']>=1914) & (df['d']<1942)]
    test=df[df['d']>=1942]
    
    
    dtrain = xgb.DMatrix(train[X], train[y])
    dtest = xgb.DMatrix(test[X], test[y])
    dvaild=xgb.DMatrix(valid[X], valid[y])
   
    params = {
    
        'booster': 'gbtree',                        # base learner will be decision tree
        'eval_metric': 'rmse',                      # evaluation metric is RMSE; early stopping is based on it
        'eta': 0.08,                                # learning rate
        'subsample': 0.8,                           # use 80% of rows in each decision tree
        'colsample_bytree': 0.8,                    # use 80% of columns in each decision tree
        'max_depth': 8,                            # allow decision trees to grow to depth of 8
   
    
        'seed': 12345                               # set random seed for reproducibility
    }
    watchlist = [(dtrain, 'train'), (dvaild, 'eval')]
    xgb_model = xgb.train(params,                   # set tuning parameters from above                   
                          dtrain,                   # training data
                          1000,                     # maximum of 1000 iterations (trees)
                          evals=watchlist,          # use watchlist for early stopping 
                          early_stopping_rounds=15, # stop after 15 iterations (trees) without improvement in eval RMSE
                          verbose_eval=5)    
    
    
    
    
    
    
    
        
        
        
    eval_preds[test.index]= xgb_model.predict(dtest)
    valid_preds[valid.index] = xgb_model.predict(dvaild)
    #valid_preds[X_valid.index] = model.predict(X_valid)
    #eval_preds[X_test.index] = model.predict(X_test)
    shap_values = xgb_model.predict(dvaild, pred_contribs=True, ntree_limit=xgb_model.best_ntree_limit)
    shap.summary_plot(shap_values[:, :-1], valid[xgb_model.feature_names])  # use valid (the per-store frame), which contains the feature columns
    
    
    
    filename = 'xgbmodel'+str(d_store_id[store])+'.pkl'
    # save model
    joblib.dump(xgb_model, filename)
    del xgb_model, train, valid, test #, y_valid
    gc.collect()
[03:43:41] WARNING: /Users/travis/build/dmlc/xgboost/src/gbm/gbtree.cc:139: Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
[0]	train-rmse:3.83537	eval-rmse:3.47593
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 15 rounds.
[5]	train-rmse:2.64920	eval-rmse:2.36318
[10]	train-rmse:1.91910	eval-rmse:1.68243
[15]	train-rmse:1.43383	eval-rmse:1.20662
[20]	train-rmse:1.13586	eval-rmse:0.91785
[25]	train-rmse:0.96527	eval-rmse:0.75539
[30]	train-rmse:0.87296	eval-rmse:0.66989
[35]	train-rmse:0.82103	eval-rmse:0.63076
[40]	train-rmse:0.78879	eval-rmse:0.60466
[45]	train-rmse:0.76561	eval-rmse:0.59029
[50]	train-rmse:0.74905	eval-rmse:0.58175
[55]	train-rmse:0.73571	eval-rmse:0.57486
[60]	train-rmse:0.72702	eval-rmse:0.56797
[65]	train-rmse:0.71712	eval-rmse:0.56452
[70]	train-rmse:0.70980	eval-rmse:0.56092
[75]	train-rmse:0.70125	eval-rmse:0.55758
[80]	train-rmse:0.69488	eval-rmse:0.55657
[85]	train-rmse:0.68848	eval-rmse:0.55498
[90]	train-rmse:0.68261	eval-rmse:0.55327
[95]	train-rmse:0.67729	eval-rmse:0.55251
[100]	train-rmse:0.66904	eval-rmse:0.55107
[105]	train-rmse:0.66195	eval-rmse:0.54992
[110]	train-rmse:0.65679	eval-rmse:0.54806
[115]	train-rmse:0.65374	eval-rmse:0.54829
[120]	train-rmse:0.64939	eval-rmse:0.54867
[125]	train-rmse:0.64563	eval-rmse:0.54894
Stopping. Best iteration:
[111]	train-rmse:0.65645	eval-rmse:0.54797

[03:58:23] WARNING: /Users/travis/build/dmlc/xgboost/src/gbm/gbtree.cc:139: Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
[0]	train-rmse:2.60054	eval-rmse:3.07647
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 15 rounds.
[5]	train-rmse:1.78391	eval-rmse:2.08519
[10]	train-rmse:1.28134	eval-rmse:1.47674
[15]	train-rmse:0.93367	eval-rmse:1.06243
[20]	train-rmse:0.71199	eval-rmse:0.81450
[25]	train-rmse:0.58452	eval-rmse:0.67205
[30]	train-rmse:0.51615	eval-rmse:0.59576
[35]	train-rmse:0.47999	eval-rmse:0.56094
[40]	train-rmse:0.45509	eval-rmse:0.53561
[45]	train-rmse:0.43822	eval-rmse:0.52401
[50]	train-rmse:0.42516	eval-rmse:0.51203
[55]	train-rmse:0.41509	eval-rmse:0.50360
[60]	train-rmse:0.40615	eval-rmse:0.49874
[65]	train-rmse:0.39920	eval-rmse:0.49528
[70]	train-rmse:0.39375	eval-rmse:0.49184
[75]	train-rmse:0.39040	eval-rmse:0.48941
[80]	train-rmse:0.38728	eval-rmse:0.48567
[85]	train-rmse:0.38272	eval-rmse:0.48247
[90]	train-rmse:0.37916	eval-rmse:0.48180
[95]	train-rmse:0.37573	eval-rmse:0.47965
[100]	train-rmse:0.37369	eval-rmse:0.48252
[105]	train-rmse:0.37096	eval-rmse:0.48128
[110]	train-rmse:0.36901	eval-rmse:0.48082
Stopping. Best iteration:
[95]	train-rmse:0.37573	eval-rmse:0.47965

[04:11:19] WARNING: /Users/travis/build/dmlc/xgboost/src/gbm/gbtree.cc:139: Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
[0]	train-rmse:5.91056	eval-rmse:4.66703
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 15 rounds.
[5]	train-rmse:4.08523	eval-rmse:3.12873
[10]	train-rmse:2.94246	eval-rmse:2.17238
[15]	train-rmse:2.19653	eval-rmse:1.53513
[20]	train-rmse:1.72504	eval-rmse:1.13278
[25]	train-rmse:1.44978	eval-rmse:0.89686
[30]	train-rmse:1.29903	eval-rmse:0.77085
[35]	train-rmse:1.21858	eval-rmse:0.70998
[40]	train-rmse:1.16573	eval-rmse:0.67770
[45]	train-rmse:1.13395	eval-rmse:0.66176
[50]	train-rmse:1.10599	eval-rmse:0.65189
[55]	train-rmse:1.08708	eval-rmse:0.64390
[60]	train-rmse:1.07225	eval-rmse:0.64198
[65]	train-rmse:1.05756	eval-rmse:0.64094
[70]	train-rmse:1.04642	eval-rmse:0.64042
[75]	train-rmse:1.04055	eval-rmse:0.63927
[80]	train-rmse:1.03761	eval-rmse:0.63863
[85]	train-rmse:1.03597	eval-rmse:0.63871
[90]	train-rmse:1.03386	eval-rmse:0.63860
[95]	train-rmse:1.02779	eval-rmse:0.63982
[100]	train-rmse:1.02447	eval-rmse:0.63931
[105]	train-rmse:1.01844	eval-rmse:0.63805
[110]	train-rmse:1.01785	eval-rmse:0.63799
[115]	train-rmse:1.01219	eval-rmse:0.63854
[120]	train-rmse:1.00320	eval-rmse:0.63846
[125]	train-rmse:0.99789	eval-rmse:0.63704
[130]	train-rmse:0.99240	eval-rmse:0.63798
[135]	train-rmse:0.99060	eval-rmse:0.63833
Stopping. Best iteration:
[124]	train-rmse:0.99810	eval-rmse:0.63698

[04:27:47] WARNING: /Users/travis/build/dmlc/xgboost/src/gbm/gbtree.cc:139: Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
[0]	train-rmse:1.87753	eval-rmse:1.84980
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 15 rounds.
[5]	train-rmse:1.27973	eval-rmse:1.24731
[10]	train-rmse:0.92424	eval-rmse:0.88484
[15]	train-rmse:0.67618	eval-rmse:0.63105
[20]	train-rmse:0.51673	eval-rmse:0.46994
[25]	train-rmse:0.42415	eval-rmse:0.37738
[30]	train-rmse:0.37275	eval-rmse:0.32947
[35]	train-rmse:0.34669	eval-rmse:0.30771
[40]	train-rmse:0.32860	eval-rmse:0.29341
[45]	train-rmse:0.31737	eval-rmse:0.28598
[50]	train-rmse:0.30765	eval-rmse:0.28096
[55]	train-rmse:0.30161	eval-rmse:0.27943
[60]	train-rmse:0.29564	eval-rmse:0.27787
[65]	train-rmse:0.28974	eval-rmse:0.27823
[70]	train-rmse:0.28553	eval-rmse:0.27910
[75]	train-rmse:0.28180	eval-rmse:0.28011
Stopping. Best iteration:
[63]	train-rmse:0.29240	eval-rmse:0.27730

[04:37:08] WARNING: /Users/travis/build/dmlc/xgboost/src/gbm/gbtree.cc:139: Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
[0]	train-rmse:3.12347	eval-rmse:2.94498
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 15 rounds.
[5]	train-rmse:2.16989	eval-rmse:1.98597
[10]	train-rmse:1.58548	eval-rmse:1.39390
[15]	train-rmse:1.18640	eval-rmse:0.99849
[20]	train-rmse:0.94085	eval-rmse:0.75933
[25]	train-rmse:0.80174	eval-rmse:0.63262
[30]	train-rmse:0.72562	eval-rmse:0.56617
[35]	train-rmse:0.68320	eval-rmse:0.53887
[40]	train-rmse:0.65307	eval-rmse:0.52413
[45]	train-rmse:0.63027	eval-rmse:0.51588
[50]	train-rmse:0.61250	eval-rmse:0.50972
[55]	train-rmse:0.60189	eval-rmse:0.50762
[60]	train-rmse:0.59275	eval-rmse:0.50506
[65]	train-rmse:0.58530	eval-rmse:0.50239
[70]	train-rmse:0.57995	eval-rmse:0.50153
[75]	train-rmse:0.57606	eval-rmse:0.50016
[80]	train-rmse:0.57193	eval-rmse:0.49787
[85]	train-rmse:0.56972	eval-rmse:0.49750
[90]	train-rmse:0.56738	eval-rmse:0.49704
[95]	train-rmse:0.56569	eval-rmse:0.49667
[100]	train-rmse:0.56203	eval-rmse:0.49647
[105]	train-rmse:0.55800	eval-rmse:0.49508
[110]	train-rmse:0.55384	eval-rmse:0.49454
[115]	train-rmse:0.54859	eval-rmse:0.49425
[120]	train-rmse:0.54432	eval-rmse:0.49384
[125]	train-rmse:0.53955	eval-rmse:0.49371
[130]	train-rmse:0.53601	eval-rmse:0.49468
[135]	train-rmse:0.53147	eval-rmse:0.49347
[140]	train-rmse:0.52682	eval-rmse:0.49250
[145]	train-rmse:0.52193	eval-rmse:0.49026
[150]	train-rmse:0.51887	eval-rmse:0.48957
[155]	train-rmse:0.51441	eval-rmse:0.48799
[160]	train-rmse:0.51047	eval-rmse:0.48798
[165]	train-rmse:0.50614	eval-rmse:0.48734
[170]	train-rmse:0.50323	eval-rmse:0.48837
[175]	train-rmse:0.49943	eval-rmse:0.48900
[180]	train-rmse:0.49612	eval-rmse:0.48886
Stopping. Best iteration:
[168]	train-rmse:0.50522	eval-rmse:0.48698

[04:56:13] WARNING: /Users/travis/build/dmlc/xgboost/src/gbm/gbtree.cc:139: Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
[0]	train-rmse:4.16411	eval-rmse:3.50619
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 15 rounds.
[5]	train-rmse:2.87127	eval-rmse:2.35941
[10]	train-rmse:2.07333	eval-rmse:1.65207
[15]	train-rmse:1.53903	eval-rmse:1.18955
[20]	train-rmse:1.21455	eval-rmse:0.92141
[25]	train-rmse:1.03222	eval-rmse:0.77160
[30]	train-rmse:0.93552	eval-rmse:0.70006
[35]	train-rmse:0.87490	eval-rmse:0.66783
[40]	train-rmse:0.83660	eval-rmse:0.64922
[45]	train-rmse:0.80982	eval-rmse:0.64133
[50]	train-rmse:0.78680	eval-rmse:0.63561
[55]	train-rmse:0.77148	eval-rmse:0.63247
[60]	train-rmse:0.75962	eval-rmse:0.63011
[65]	train-rmse:0.74898	eval-rmse:0.62936
[70]	train-rmse:0.73971	eval-rmse:0.62707
[75]	train-rmse:0.73116	eval-rmse:0.62550
[80]	train-rmse:0.72126	eval-rmse:0.62532
[85]	train-rmse:0.71559	eval-rmse:0.62421
[90]	train-rmse:0.70772	eval-rmse:0.62392
[95]	train-rmse:0.70160	eval-rmse:0.62294
[100]	train-rmse:0.69661	eval-rmse:0.62315
[105]	train-rmse:0.68994	eval-rmse:0.62353
[110]	train-rmse:0.68547	eval-rmse:0.62072
[115]	train-rmse:0.68010	eval-rmse:0.62147
[120]	train-rmse:0.67627	eval-rmse:0.62167
[125]	train-rmse:0.67079	eval-rmse:0.62169
Stopping. Best iteration:
[110]	train-rmse:0.68547	eval-rmse:0.62072

[05:09:40] WARNING: /Users/travis/build/dmlc/xgboost/src/gbm/gbtree.cc:139: Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
[0]	train-rmse:3.55335	eval-rmse:3.44137
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 15 rounds.
[5]	train-rmse:2.45079	eval-rmse:2.32990
[10]	train-rmse:1.77504	eval-rmse:1.65474
[15]	train-rmse:1.32673	eval-rmse:1.22048
[20]	train-rmse:1.05885	eval-rmse:0.96474
[25]	train-rmse:0.90100	eval-rmse:0.82489
[30]	train-rmse:0.80644	eval-rmse:0.75766
[35]	train-rmse:0.75828	eval-rmse:0.72586
[40]	train-rmse:0.72349	eval-rmse:0.70864
[45]	train-rmse:0.70176	eval-rmse:0.70173
[50]	train-rmse:0.68529	eval-rmse:0.69530
[55]	train-rmse:0.67322	eval-rmse:0.68966
[60]	train-rmse:0.66247	eval-rmse:0.68541
[65]	train-rmse:0.65289	eval-rmse:0.68283
[70]	train-rmse:0.64565	eval-rmse:0.68130
[75]	train-rmse:0.63965	eval-rmse:0.67799
[80]	train-rmse:0.63406	eval-rmse:0.67655
[85]	train-rmse:0.62981	eval-rmse:0.67400
[90]	train-rmse:0.62308	eval-rmse:0.67229
[95]	train-rmse:0.61846	eval-rmse:0.67149
[100]	train-rmse:0.61291	eval-rmse:0.66905
[105]	train-rmse:0.60663	eval-rmse:0.66700
[110]	train-rmse:0.60138	eval-rmse:0.66591
[115]	train-rmse:0.59363	eval-rmse:0.66612
[120]	train-rmse:0.59061	eval-rmse:0.66451
[125]	train-rmse:0.58696	eval-rmse:0.66260
[130]	train-rmse:0.58298	eval-rmse:0.66372
[135]	train-rmse:0.57760	eval-rmse:0.66262
[140]	train-rmse:0.57357	eval-rmse:0.66116
[145]	train-rmse:0.56907	eval-rmse:0.66165
[150]	train-rmse:0.56576	eval-rmse:0.66202
[155]	train-rmse:0.56066	eval-rmse:0.66332
Stopping. Best iteration:
[143]	train-rmse:0.57074	eval-rmse:0.66112

[05:26:12] WARNING: /Users/travis/build/dmlc/xgboost/src/gbm/gbtree.cc:139: Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
[0]	train-rmse:2.28129	eval-rmse:2.48943
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 15 rounds.
[5]	train-rmse:1.55368	eval-rmse:1.66708
[10]	train-rmse:1.10905	eval-rmse:1.16225
[15]	train-rmse:0.80610	eval-rmse:0.82400
[20]	train-rmse:0.61162	eval-rmse:0.61432
[25]	train-rmse:0.49844	eval-rmse:0.49566
[30]	train-rmse:0.43516	eval-rmse:0.43423
[35]	train-rmse:0.40214	eval-rmse:0.40684
[40]	train-rmse:0.38032	eval-rmse:0.39095
[45]	train-rmse:0.36659	eval-rmse:0.38298
[50]	train-rmse:0.35790	eval-rmse:0.37755
[55]	train-rmse:0.35095	eval-rmse:0.37413
[60]	train-rmse:0.34346	eval-rmse:0.36920
[65]	train-rmse:0.33816	eval-rmse:0.36682
[70]	train-rmse:0.33402	eval-rmse:0.36404
[75]	train-rmse:0.33018	eval-rmse:0.36266
[80]	train-rmse:0.32795	eval-rmse:0.36189
[85]	train-rmse:0.32555	eval-rmse:0.36141
[90]	train-rmse:0.32393	eval-rmse:0.36053
[95]	train-rmse:0.32228	eval-rmse:0.35988
[100]	train-rmse:0.32142	eval-rmse:0.35960
[105]	train-rmse:0.31906	eval-rmse:0.35928
[110]	train-rmse:0.31729	eval-rmse:0.35874
[115]	train-rmse:0.31367	eval-rmse:0.35713
[120]	train-rmse:0.31220	eval-rmse:0.35637
[125]	train-rmse:0.30907	eval-rmse:0.35645
[130]	train-rmse:0.30582	eval-rmse:0.35620
[135]	train-rmse:0.30345	eval-rmse:0.35573
[140]	train-rmse:0.30053	eval-rmse:0.35492
[145]	train-rmse:0.29862	eval-rmse:0.35376
[150]	train-rmse:0.29598	eval-rmse:0.35298
[155]	train-rmse:0.29522	eval-rmse:0.35223
[160]	train-rmse:0.29344	eval-rmse:0.35264
[165]	train-rmse:0.29182	eval-rmse:0.35210
[170]	train-rmse:0.29004	eval-rmse:0.35170
[175]	train-rmse:0.28753	eval-rmse:0.35116
[180]	train-rmse:0.28578	eval-rmse:0.35112
[185]	train-rmse:0.28401	eval-rmse:0.35064
[190]	train-rmse:0.28164	eval-rmse:0.35002
[195]	train-rmse:0.27927	eval-rmse:0.35031
[200]	train-rmse:0.27677	eval-rmse:0.35015
[205]	train-rmse:0.27472	eval-rmse:0.35007
[210]	train-rmse:0.27312	eval-rmse:0.34962
[215]	train-rmse:0.27166	eval-rmse:0.35024
[220]	train-rmse:0.27012	eval-rmse:0.35020
Stopping. Best iteration:
[209]	train-rmse:0.27353	eval-rmse:0.34958

[05:49:09] WARNING: /Users/travis/build/dmlc/xgboost/src/gbm/gbtree.cc:139: Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
[0]	train-rmse:3.64827	eval-rmse:4.71322
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 15 rounds.
[5]	train-rmse:2.49903	eval-rmse:3.18903
[10]	train-rmse:1.79457	eval-rmse:2.28053
[15]	train-rmse:1.31427	eval-rmse:1.69038
[20]	train-rmse:1.01856	eval-rmse:1.36862
[25]	train-rmse:0.84960	eval-rmse:1.20036
[30]	train-rmse:0.75852	eval-rmse:1.11756
[35]	train-rmse:0.70881	eval-rmse:1.08088
[40]	train-rmse:0.67824	eval-rmse:1.05963
[45]	train-rmse:0.66018	eval-rmse:1.04749
[50]	train-rmse:0.64715	eval-rmse:1.04000
[55]	train-rmse:0.63537	eval-rmse:1.03753
[60]	train-rmse:0.62794	eval-rmse:1.03559
[65]	train-rmse:0.61867	eval-rmse:1.03395
[70]	train-rmse:0.61099	eval-rmse:1.03349
[75]	train-rmse:0.60376	eval-rmse:1.03223
[80]	train-rmse:0.60237	eval-rmse:1.03184
[85]	train-rmse:0.59953	eval-rmse:1.03048
[90]	train-rmse:0.59608	eval-rmse:1.03087
[95]	train-rmse:0.59303	eval-rmse:1.02833
[100]	train-rmse:0.59130	eval-rmse:1.02703
[105]	train-rmse:0.58739	eval-rmse:1.02589
[110]	train-rmse:0.58454	eval-rmse:1.02344
[115]	train-rmse:0.58216	eval-rmse:1.02297
[120]	train-rmse:0.57982	eval-rmse:1.02340
[125]	train-rmse:0.57613	eval-rmse:1.02038
[130]	train-rmse:0.57218	eval-rmse:1.01955
[135]	train-rmse:0.56814	eval-rmse:1.01689
[140]	train-rmse:0.56440	eval-rmse:1.01880
[145]	train-rmse:0.55838	eval-rmse:1.01911
Stopping. Best iteration:
[134]	train-rmse:0.56914	eval-rmse:1.01612

[06:04:44] WARNING: /Users/travis/build/dmlc/xgboost/src/gbm/gbtree.cc:139: Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
[0]	train-rmse:3.78810	eval-rmse:3.61889
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 15 rounds.
[5]	train-rmse:2.61678	eval-rmse:2.45657
[10]	train-rmse:1.89819	eval-rmse:1.74675
[15]	train-rmse:1.42087	eval-rmse:1.27551
[20]	train-rmse:1.12728	eval-rmse:0.99009
[25]	train-rmse:0.95874	eval-rmse:0.83357
[30]	train-rmse:0.86454	eval-rmse:0.75434
[35]	train-rmse:0.81381	eval-rmse:0.71695
[40]	train-rmse:0.77890	eval-rmse:0.69361
[45]	train-rmse:0.75434	eval-rmse:0.67996
[50]	train-rmse:0.73506	eval-rmse:0.66578
[55]	train-rmse:0.72361	eval-rmse:0.65691
[60]	train-rmse:0.71347	eval-rmse:0.65625
[65]	train-rmse:0.70328	eval-rmse:0.65242
[70]	train-rmse:0.69695	eval-rmse:0.64952
[75]	train-rmse:0.69396	eval-rmse:0.64901
[80]	train-rmse:0.68949	eval-rmse:0.64707
[85]	train-rmse:0.68567	eval-rmse:0.64643
[90]	train-rmse:0.68124	eval-rmse:0.64567
[95]	train-rmse:0.67897	eval-rmse:0.64587
[100]	train-rmse:0.67462	eval-rmse:0.64330
[105]	train-rmse:0.67189	eval-rmse:0.64267
[110]	train-rmse:0.66873	eval-rmse:0.64110
[115]	train-rmse:0.66661	eval-rmse:0.64031
[120]	train-rmse:0.66407	eval-rmse:0.63972
[125]	train-rmse:0.66186	eval-rmse:0.63949
[130]	train-rmse:0.65884	eval-rmse:0.63908
[135]	train-rmse:0.65680	eval-rmse:0.63852
[140]	train-rmse:0.65521	eval-rmse:0.63780
[145]	train-rmse:0.65406	eval-rmse:0.63775
[150]	train-rmse:0.65119	eval-rmse:0.63933
[155]	train-rmse:0.64816	eval-rmse:0.64009
[160]	train-rmse:0.64481	eval-rmse:0.64023
Stopping. Best iteration:
[145]	train-rmse:0.65406	eval-rmse:0.63775

In [46]:
actual = False
if actual == False:
    #Use the actual validation-period sales (already released, since less than one month was left in the competition)
    validation = sales[['id']+['d_' + str(i) for i in range(1914,1942)]]
    validation['id']=pd.read_csv('sales_train_validation.csv').id
    validation.columns=['id'] + ['F' + str(i + 1) for i in range(28)]
else:
    #Otherwise, build the validation results from the model's predictions
    valid['sold'] = valid_preds
    validation = valid[['id','d','sold']]
    validation = pd.pivot(validation, index='id', columns='d', values='sold').reset_index()
    validation.columns=['id'] + ['F' + str(i + 1) for i in range(28)]
    validation.id = validation.id.map(d_id).str.replace('evaluation','validation')
#create the validation part of the submission
In [47]:
validation
Out[47]:
id F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28
0 HOBBIES_1_001_CA_1_validation 0 0 0 2 0 3 5 0 0 1 1 0 2 1 2 2 1 0 2 4 0 0 0 0 3 3 0 1
1 HOBBIES_1_002_CA_1_validation 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 2 1 1 0 0 0 0 0
2 HOBBIES_1_003_CA_1_validation 0 0 1 1 0 2 1 0 0 0 0 2 1 3 0 0 1 0 1 0 2 0 0 0 2 3 0 1
3 HOBBIES_1_004_CA_1_validation 0 0 1 2 4 1 6 4 0 0 0 2 2 4 2 1 1 1 1 1 0 4 0 1 3 0 2 6
4 HOBBIES_1_005_CA_1_validation 1 0 2 3 1 0 3 2 3 1 1 3 2 3 2 2 2 2 0 0 0 2 1 0 0 2 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
30485 FOODS_3_823_WI_3_validation 0 0 0 2 2 0 0 0 2 0 0 1 1 1 0 0 1 0 1 0 3 0 1 1 0 0 1 1
30486 FOODS_3_824_WI_3_validation 0 1 1 1 0 0 0 0 1 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0
30487 FOODS_3_825_WI_3_validation 0 0 1 1 0 2 1 1 0 0 1 0 0 1 0 3 3 1 0 0 1 2 0 1 0 1 0 2
30488 FOODS_3_826_WI_3_validation 1 3 0 1 2 1 0 2 1 1 2 0 2 1 1 0 2 1 1 1 1 4 6 0 1 1 1 0
30489 FOODS_3_827_WI_3_validation 0 0 0 0 0 1 1 1 2 0 1 3 2 1 1 0 0 0 1 2 0 5 4 0 2 2 5 1

30490 rows × 29 columns

In [51]:
actual = False
if actual == False:
    #Use the actual validation-period sales (already released, since less than one month was left in the competition)
    validation = sales[['id']+['d_' + str(i) for i in range(1914,1942)]]
    validation['id']=pd.read_csv('sales_train_validation.csv').id
    validation.columns=['id'] + ['F' + str(i + 1) for i in range(28)]
else:
    #Otherwise, build the validation results from the model's predictions
    valid1['sold'] = valid_preds
    validation = valid1[['id','d','sold']]
    validation = pd.pivot(validation, index='id', columns='d', values='sold').reset_index()
    validation.columns=['id'] + ['F' + str(i + 1) for i in range(28)]
    validation.id = validation.id.map(d_id).str.replace('evaluation','validation')
In [54]:
test1['sold'] = eval_preds
evaluation = test1[['id','d','sold']]
evaluation = pd.pivot(evaluation, index='id', columns='d', values='sold').reset_index()
evaluation.columns=['id'] + ['F' + str(i + 1) for i in range(28)]
#Map the encoded ids back to the original id strings
evaluation.id = evaluation.id.map(d_id)

#Prepare the submission
submit = pd.concat([validation,evaluation]).reset_index(drop=True)
submit.to_csv('xgboostsub.csv',index=False)
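
Before submitting, a quick shape check helps catch formatting mistakes (a sketch; the expected counts come from the M5 data, 30,490 series each for the validation and evaluation parts):

# sanity check (illustrative): 30,490 validation rows + 30,490 evaluation rows, 29 columns
assert submit.shape == (60980, 29)
assert list(submit.columns) == ['id'] + ['F' + str(i + 1) for i in range(28)]
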
In [55]:
subxg=pd.read_csv('xgboostsub.csv')
subxg  # the final submission file to be uploaded
Out[55]:
id F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28
0 HOBBIES_1_001_CA_1_validation 0.000000 0.000000 0.000000 2.000000 0.000000 3.000000 5.000000 0.000000 0.000000 1.000000 1.000000 0.000000 2.000000 1.000000 2.000000 2.000000 1.000000 0.000000 2.000000 4.000000 0.000000 0.000000 0.000000 0.000000 3.000000 3.000000 0.000000 1.000000
1 HOBBIES_1_002_CA_1_validation 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 2.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
2 HOBBIES_1_003_CA_1_validation 0.000000 0.000000 1.000000 1.000000 0.000000 2.000000 1.000000 0.000000 0.000000 0.000000 0.000000 2.000000 1.000000 3.000000 0.000000 0.000000 1.000000 0.000000 1.000000 0.000000 2.000000 0.000000 0.000000 0.000000 2.000000 3.000000 0.000000 1.000000
3 HOBBIES_1_004_CA_1_validation 0.000000 0.000000 1.000000 2.000000 4.000000 1.000000 6.000000 4.000000 0.000000 0.000000 0.000000 2.000000 2.000000 4.000000 2.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000 4.000000 0.000000 1.000000 3.000000 0.000000 2.000000 6.000000
4 HOBBIES_1_005_CA_1_validation 1.000000 0.000000 2.000000 3.000000 1.000000 0.000000 3.000000 2.000000 3.000000 1.000000 1.000000 3.000000 2.000000 3.000000 2.000000 2.000000 2.000000 2.000000 0.000000 0.000000 0.000000 2.000000 1.000000 0.000000 0.000000 2.000000 1.000000 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
60975 HOUSEHOLD_2_516_TX_2_evaluation -0.005275 0.001601 0.000766 0.006293 0.006293 -0.003832 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200 -0.001200
60976 HOUSEHOLD_2_516_TX_3_evaluation -0.000304 -0.008338 0.010343 0.007954 0.003547 -0.009774 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812 -0.001812
60977 HOUSEHOLD_2_516_WI_1_evaluation -0.000218 0.007209 0.007209 -0.004227 -0.001106 -0.001172 -0.001172 -0.001106 -0.001106 -0.001840 -0.001106 -0.001106 -0.001172 -0.001172 -0.001106 -0.001106 -0.001106 -0.001106 -0.001106 -0.001172 -0.001172 -0.001106 -0.001106 -0.001106 -0.001106 -0.001106 -0.001172 -0.001172
60978 HOUSEHOLD_2_516_WI_2_evaluation -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820 -0.000820
60979 HOUSEHOLD_2_516_WI_3_evaluation -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204 -0.001204

60980 rows × 29 columns

SHAP by store id

In [87]:
#shap
In [4]:
dfshap = pd.read_pickle('data.pkl')
In [6]:
y = 'sold'
X = [name for name in dfshap.columns if name not in [y,'id']]
print('y =', y)
print('X =', X)
y = sold
X = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'revenue', 'sold_lag_1', 'sold_lag_2', 'sold_lag_3', 'sold_lag_6', 'sold_lag_12', 'sold_lag_24', 'sold_lag_36', 'iteam_sold_avg', 'state_sold_avg', 'store_sold_avg', 'cat_sold_avg', 'dept_sold_avg', 'cat_dept_sold_avg', 'store_item_sold_avg', 'cat_item_sold_avg', 'dept_item_sold_avg', 'state_store_sold_avg', 'state_store_cat_sold_avg', 'store_cat_dept_sold_avg', 'rolling_sold_mean', 'expanding_sold_mean', 'selling_trend']
In [90]:
#Shap of store CA_1
df0 = dfshap[dfshap['store_id']==0]
validshap0 = df0[(df0['d']>=1914) & (df0['d']<1942)]
dvaild=xgb.DMatrix(validshap0[X], validshap0[y])
xgb_model = joblib.load('xgbmodelCA_1.pkl')
shap_values = xgb_model.predict(dvaild, pred_contribs=True, ntree_limit=xgb_model.best_ntree_limit)
shap.summary_plot(shap_values[:, :-1], validshap0[xgb_model.feature_names])
In [92]:
#Shap of store CA_2
df1 = dfshap[dfshap['store_id']==1]
validshap0 = df1[(df1['d']>=1914) & (df1['d']<1942)]
dvaild=xgb.DMatrix(validshap0[X], validshap0[y])
xgb_model = joblib.load('xgbmodelCA_2.pkl')
shap_values = xgb_model.predict(dvaild, pred_contribs=True, ntree_limit=xgb_model.best_ntree_limit)
shap.summary_plot(shap_values[:, :-1], validshap0[xgb_model.feature_names])
In [93]:
#Shap of store CA_3
df2 = dfshap[dfshap['store_id']==2]
validshap0 = df2[(df2['d']>=1914) & (df2['d']<1942)]
dvaild=xgb.DMatrix(validshap0[X], validshap0[y])
xgb_model = joblib.load('xgbmodelCA_3.pkl')
shap_values = xgb_model.predict(dvaild, pred_contribs=True, ntree_limit=xgb_model.best_ntree_limit)
shap.summary_plot(shap_values[:, :-1], validshap0[xgb_model.feature_names])
In [94]:
#Shap of store CA_4
df3 = dfshap[dfshap['store_id']==3]
validshap0 = df3[(df3['d']>=1914) & (df3['d']<1942)]
dvaild=xgb.DMatrix(validshap0[X], validshap0[y])
xgb_model = joblib.load('xgbmodelCA_4.pkl')
shap_values = xgb_model.predict(dvaild, pred_contribs=True, ntree_limit=xgb_model.best_ntree_limit)
shap.summary_plot(shap_values[:, :-1], validshap0[xgb_model.feature_names])
In [8]:
#Shap of store TX_1
df4 = dfshap[dfshap['store_id']==4]
validshap0 = df4[(df4['d']>=1914) & (df4['d']<1942)]
dvaild=xgb.DMatrix(validshap0[X], validshap0[y])
xgb_model = joblib.load('xgbmodelTX_1.pkl')
shap_values = xgb_model.predict(dvaild, pred_contribs=True, ntree_limit=xgb_model.best_ntree_limit)
shap.summary_plot(shap_values[:, :-1], validshap0[xgb_model.feature_names])
In [96]:
#Shap of store TX_2
df5 = dfshap[dfshap['store_id']==5]
validshap0 = df5[(df5['d']>=1914) & (df5['d']<1942)]
dvaild=xgb.DMatrix(validshap0[X], validshap0[y])
xgb_model = joblib.load('xgbmodelTX_2.pkl')
shap_values = xgb_model.predict(dvaild, pred_contribs=True, ntree_limit=xgb_model.best_ntree_limit)
shap.summary_plot(shap_values[:, :-1], validshap0[xgb_model.feature_names])
In [97]:
#Shap of store TX_3
df6 = dfshap[dfshap['store_id']==6]
validshap0 = df6[(df6['d']>=1914) & (df6['d']<1942)]
dvaild=xgb.DMatrix(validshap0[X], validshap0[y])
xgb_model = joblib.load('xgbmodelTX_3.pkl')
shap_values = xgb_model.predict(dvaild, pred_contribs=True, ntree_limit=xgb_model.best_ntree_limit)
shap.summary_plot(shap_values[:, :-1], validshap0[xgb_model.feature_names])
In [7]:
#Shap of store WI_1
df7 = dfshap[dfshap['store_id']==7]
validshap0 = df7[(df7['d']>=1914) & (df7['d']<1942)]
dvaild=xgb.DMatrix(validshap0[X], validshap0[y])
xgb_model = joblib.load('xgbmodelWI_1.pkl')
shap_values = xgb_model.predict(dvaild, pred_contribs=True, ntree_limit=xgb_model.best_ntree_limit)
shap.summary_plot(shap_values[:, :-1], validshap0[xgb_model.feature_names])
In [9]:
#Shap of store WI_2
df8 = dfshap[dfshap['store_id']==8]
validshap0 = df8[(df8['d']>=1914) & (df8['d']<1942)]
dvaild=xgb.DMatrix(validshap0[X], validshap0[y])
xgb_model = joblib.load('xgbmodelWI_2.pkl')
shap_values = xgb_model.predict(dvaild, pred_contribs=True, ntree_limit=xgb_model.best_ntree_limit)
shap.summary_plot(shap_values[:, :-1], validshap0[xgb_model.feature_names])
In [10]:
#Shap of store WI_3
df9 = dfshap[dfshap['store_id']==9]
validshap0 = df9[(df9['d']>=1914) & (df9['d']<1942)]
dvaild=xgb.DMatrix(validshap0[X], validshap0[y])
xgb_model = joblib.load('xgbmodelWI_3.pkl')
shap_values = xgb_model.predict(dvaild, pred_contribs=True, ntree_limit=xgb_model.best_ntree_limit)
shap.summary_plot(shap_values[:, :-1], validshap0[xgb_model.feature_names])

From the SHAP summary plots of the 10 stores, we find that revenue, rolling_sold_mean, selling_trend, sell_price and store_item_sold_avg are consistently the most important features. In other words, price and recent sales history drive the predictions, while time-related features (calendar information and the lag variables) have comparatively little effect on the models.
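As a cross-check on this reading of the plots, the ranking can also be computed numerically by averaging the absolute SHAP contributions per feature. A minimal sketch, assuming the shap_values array, validshap0 frame and xgb_model from the last SHAP cell above are still in scope (the final column of shap_values is the bias term and is dropped):

import numpy as np
import pandas as pd

# mean absolute SHAP contribution per feature, largest first
mean_abs_shap = np.abs(shap_values[:, :-1]).mean(axis=0)
importance = pd.Series(mean_abs_shap, index=xgb_model.feature_names)
print(importance.sort_values(ascending=False).head(10))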

Partial dependence and ICE for store CA_1

Note: As mentioned above, we trained 10 models, one per store. We therefore select one model (CA_1) as a representative and carry out the full ICE and partial dependence analysis for it below. The results for the remaining 9 models and stores can be produced in exactly the same way, so we do not repeat that (duplicated but substantial) work here; the sketch after this note shows how it could be looped.
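For completeness, a hypothetical sketch of how the same partial dependence workflow could be looped over all 10 stores, assuming the store_id codes 0-9 map to the saved model files in the order used in the SHAP cells above, and that dfshap, X and the par_dep helper defined later in this section are in scope:

store_files = ['xgbmodelCA_1.pkl', 'xgbmodelCA_2.pkl', 'xgbmodelCA_3.pkl',
               'xgbmodelCA_4.pkl', 'xgbmodelTX_1.pkl', 'xgbmodelTX_2.pkl',
               'xgbmodelTX_3.pkl', 'xgbmodelWI_1.pkl', 'xgbmodelWI_2.pkl',
               'xgbmodelWI_3.pkl']

for store_code, fname in enumerate(store_files):
    # validation window (d 1914-1941) for this store
    df_s = dfshap[dfshap['store_id'] == store_code]
    valid_s = df_s[(df_s['d'] >= 1914) & (df_s['d'] < 1942)]
    model_s = joblib.load(fname)
    # partial dependence of each top feature for this store's model
    pd_results = {feat: par_dep(feat, valid_s[X], model_s)
                  for feat in ['rolling_sold_mean', 'selling_trend', 'sell_price',
                               'revenue', 'store_item_sold_avg']}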

In [12]:
dfshap = pd.read_pickle('data.pkl')
In [13]:
y = 'sold'
X = [name for name in dfshap.columns if name not in [y,'id']]
print('y =', y)
print('X =', X)
y = sold
X = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'revenue', 'sold_lag_1', 'sold_lag_2', 'sold_lag_3', 'sold_lag_6', 'sold_lag_12', 'sold_lag_24', 'sold_lag_36', 'iteam_sold_avg', 'state_sold_avg', 'store_sold_avg', 'cat_sold_avg', 'dept_sold_avg', 'cat_dept_sold_avg', 'store_item_sold_avg', 'cat_item_sold_avg', 'dept_item_sold_avg', 'state_store_sold_avg', 'state_store_cat_sold_avg', 'store_cat_dept_sold_avg', 'rolling_sold_mean', 'expanding_sold_mean', 'selling_trend']
In [15]:
#split the data
valid = dfshap[(dfshap['d']>=1914) & (dfshap['d']<1942)][['id','d','sold']]
test = dfshap[dfshap['d']>=1942][['id','d','sold']]
eval_preds = test['sold']
valid_preds = valid['sold']
In [11]:
def par_dep(xs, frame, model, resolution=20, bins=None):
    
    """ Creates Pandas DataFrame containing partial dependence for a 
        single variable.
    
    Args:
        xs: Variable for which to calculate partial dependence.
        frame: Pandas DataFrame for which to calculate partial dependence.
        model: XGBoost model for which to calculate partial dependence.
        resolution: The number of points across the domain of xs for which 
                    to calculate partial dependence, default 20.
        bins: List of values at which to set xs, default 20 equally-spaced 
              points between column minimum and maximum.
    
    Returns:
        Pandas DataFrame containing partial dependence values.
        
    """
    
    # turn off pesky Pandas copy warning
    pd.options.mode.chained_assignment = None
    
    # initialize empty Pandas DataFrame with correct column names
    par_dep_frame = pd.DataFrame(columns=[xs, 'partial_dependence'])
    
    # cache original column values 
    col_cache = frame.loc[:, xs].copy(deep=True)
  
    # determine values at which to calculate partial dependence
    if bins is None:
        min_ = frame[xs].min()
        max_ = frame[xs].max()
        by = (max_ - min_)/resolution
        bins = np.arange(min_, max_, by)
        
    # calculate partial dependence  
    # by setting column of interest to constant 
    # and scoring the altered data and taking the mean of the predictions
    for j in bins:
        frame.loc[:, xs] = j
        dframe = xgb.DMatrix(frame)
        par_dep_i = pd.DataFrame(model.predict(dframe, ntree_limit=model.best_ntree_limit))
        par_dep_j = par_dep_i.mean()[0]
        par_dep_frame = par_dep_frame.append({xs:j,
                                              'partial_dependence': par_dep_j}, 
                                              ignore_index=True)
        
    # return input frame to original cached state    
    frame.loc[:, xs] = col_cache

    return par_dep_frame
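In effect, par_dep computes the standard one-dimensional partial dependence estimate: for a variable of interest $x_s$ and a grid value $v$,

$$\widehat{PD}(x_s = v) = \frac{1}{N}\sum_{i=1}^{N} \hat{f}\big(v,\, x_C^{(i)}\big),$$

where $\hat{f}$ is the fitted GBM, $x_C^{(i)}$ are the remaining feature values of row $i$, and $N$ is the number of rows in the scoring frame; the loop above substitutes each grid value into the whole column and averages the resulting predictions.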

Partial dependence for the most important input variables in the GBM

In [16]:
# PD of store CA_1
df0 = dfshap[dfshap['store_id']==0]
validshap0 = df0[(df0['d']>=1914) & (df0['d']<1942)]
#dvaild=xgb.DMatrix(validshap0[X], validshap0[y])
xgb_model = joblib.load('xgbmodelCA_1.pkl')
In [26]:
par_dep_rolling_sold_mean = par_dep('rolling_sold_mean', validshap0[X], xgb_model)   # calculate partial dependence for rolling sold mean value

# display partial dependence for rolling sold mean
par_dep_rolling_sold_mean
Out[26]:
rolling_sold_mean partial_dependence
0 0.000000 1.405705
1 3.628125 1.631906
2 7.256250 2.442842
3 10.884375 2.547370
4 14.512500 5.146945
5 18.140625 5.146945
6 21.768750 5.146945
7 25.396875 5.146945
8 29.025000 5.146945
9 32.653125 5.146945
10 36.281250 5.146945
11 39.909375 5.146945
12 43.537500 5.146945
13 47.165625 5.146945
14 50.793750 5.146945
15 54.421875 5.146945
16 58.050000 5.146945
17 61.678125 5.146945
18 65.306250 5.146945
19 68.934375 5.146945
In [22]:
par_dep_selling_trend = par_dep('selling_trend', validshap0[X], xgb_model) # calculate partial dependence for selling trend

## display partial dependence for selling trend
par_dep_selling_trend
Out[22]:
selling_trend partial_dependence
0 -35.625 0.862477
1 -29.750 0.862477
2 -23.875 0.862477
3 -18.000 0.862477
4 -12.125 0.862477
5 -6.250 0.862477
6 -0.375 1.166900
7 5.500 4.870893
8 11.375 7.259646
9 17.250 7.259646
10 23.125 7.259646
11 29.000 7.259646
12 34.875 7.259646
13 40.750 7.259646
14 46.625 7.259646
15 52.500 7.259646
16 58.375 7.259646
17 64.250 7.259646
18 70.125 7.259646
19 76.000 7.259646
In [23]:
par_dep_sell_price = par_dep('sell_price', validshap0[X], xgb_model)# calculate partial dependence for sell price

# display partial dependence for sell price
par_dep_sell_price
Out[23]:
sell_price partial_dependence
0 0.229980 5.315265
1 1.716699 1.945405
2 3.203418 1.503798
3 4.690137 1.412638
4 6.176855 1.346705
5 7.663574 1.334537
6 9.150293 1.328165
7 10.637012 1.321059
8 12.123730 1.314348
9 13.610449 1.312560
10 15.097168 1.305304
11 16.583887 1.305304
12 18.070605 1.305304
13 19.557324 1.305304
14 21.044043 1.305304
15 22.530762 1.305304
16 24.017480 1.305304
17 25.504199 1.305304
18 26.990918 1.305304
19 28.477637 1.305304
In [24]:
par_dep_revenue = par_dep('revenue', validshap0[X], xgb_model)# calculate partial dependence for revenue

# display partial dependence for revenue
par_dep_revenue
Out[24]:
revenue partial_dependence
0 0.000000 1.044398
1 28.388672 4.384423
2 56.777344 6.234501
3 85.166016 6.234501
4 113.554688 6.234501
5 141.943359 6.234501
6 170.332031 6.234501
7 198.720703 6.234501
8 227.109375 6.234501
9 255.498047 6.234501
10 283.886719 6.234501
11 312.275391 6.234501
12 340.664062 6.234501
13 369.052734 6.234501
14 397.441406 6.234501
15 425.830078 6.234501
16 454.218750 6.234501
17 482.607422 6.234501
18 510.996094 6.234501
19 539.384766 6.234501
In [25]:
par_dep_store_item_sold_avg = par_dep('store_item_sold_avg', validshap0[X], xgb_model)# calculate partial dependence for store item sold average 

# display partial dependence for store item sold average 
par_dep_store_item_sold_avg
Out[25]:
store_item_sold_avg partial_dependence
0 0.011711 1.319814
1 3.292961 2.250013
2 6.574211 3.037568
3 9.855461 3.412627
4 13.136711 4.277071
5 16.417961 4.277071
6 19.699211 4.277071
7 22.980461 4.277071
8 26.261711 4.277071
9 29.542961 4.277071
10 32.824211 4.277071
11 36.105461 4.277071
12 39.386711 4.277071
13 42.667961 4.277071
14 45.949211 4.277071
15 49.230461 4.277071
16 52.511711 4.277071
17 55.792961 4.277071
18 59.074211 4.277071
19 62.355461 4.277071

Helper function for finding percentiles of predictions

In [35]:
def get_percentile_dict(yhat, id_, frame):

    """ Returns the percentiles of a column, yhat, as the indices based on 
        another column id_.
    
    Args:
        yhat: Column in which to find percentiles.
        id_: Id column that stores indices for percentiles of yhat.
        frame: Pandas DataFrame containing yhat and id_. 
    
    Returns:
        Dictionary of percentile values and index column values.
    
    """
    
    # create a copy of frame and sort it by yhat
    sort_df = frame.copy(deep=True)
    sort_df.sort_values(yhat, inplace=True)
    sort_df.reset_index(inplace=True)
    
    # find top and bottom percentiles
    percentiles_dict = {}
    percentiles_dict[0] = sort_df.loc[0, id_]
    percentiles_dict[99] = sort_df.loc[sort_df.shape[0]-1, id_]

    # find 10th-90th percentiles
    inc = sort_df.shape[0]//10
    for i in range(1, 10):
        percentiles_dict[i * 10] = sort_df.loc[i * inc,  id_]

    return percentiles_dict
In [29]:
df0 = dfshap[dfshap['store_id']==0]
validshap0 = df0[(df0['d']>=1914) & (df0['d']<1942)]
dvaild=xgb.DMatrix(validshap0[X], validshap0[y])
xgb_model = joblib.load('xgbmodelCA_1.pkl')

Find some percentiles of yhat in the validation data

The values of id that correspond to the minimum, deciles, and maximum of p_sold are displayed below. ICE will be calculated for the rows of the validation dataset (d 1914–1941) associated with these id values.

In [30]:
yhat_test = pd.concat([validshap0.reset_index(drop=True), pd.DataFrame(xgb_model.predict(dvaild, 
                                                                                   ntree_limit=xgb_model.best_ntree_limit))],
                      axis=1)
In [31]:
yhat_test
Out[31]:
id item_id dept_id cat_id store_id state_id d sold wm_yr_wk weekday wday month year event_name_1 event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI sell_price revenue sold_lag_1 sold_lag_2 sold_lag_3 sold_lag_6 sold_lag_12 sold_lag_24 sold_lag_36 iteam_sold_avg state_sold_avg store_sold_avg cat_sold_avg dept_sold_avg cat_dept_sold_avg store_item_sold_avg cat_item_sold_avg dept_item_sold_avg state_store_sold_avg state_store_cat_sold_avg store_cat_dept_sold_avg rolling_sold_mean expanding_sold_mean selling_trend 0
0 14370 1437 3 1 0 0 1914 0 11613 1 3 4 2016 -1 -1 -1 -1 0 0 0 8.382812 0.000000 1.0 1.0 0.0 1.0 0.0 0.0 3.0 0.217041 1.21875 1.307617 0.562500 0.697754 0.697754 0.322266 0.217041 0.217041 1.307617 0.803711 1.022461 1.000000 0.314209 -0.322266 0.017493
1 14380 1438 3 1 0 0 1914 0 11613 1 3 4 2016 -1 -1 -1 -1 0 0 0 3.970703 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.260498 1.21875 1.307617 0.562500 0.697754 0.697754 0.254639 0.260498 0.260498 1.307617 0.803711 1.022461 0.142822 0.258301 -0.254639 0.005635
2 14390 1439 3 1 0 0 1914 0 11613 1 3 4 2016 -1 -1 -1 -1 0 0 0 2.970703 0.000000 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.076904 1.21875 1.307617 0.562500 0.697754 0.697754 0.157349 0.076904 0.076904 1.307617 0.803711 1.022461 0.714355 0.150879 -0.157349 -0.005114
3 14400 1440 3 1 0 0 1914 0 11613 1 3 4 2016 -1 -1 -1 -1 0 0 0 4.640625 0.000000 2.0 7.0 3.0 1.0 3.0 3.0 1.0 2.017578 1.21875 1.307617 0.562500 0.697754 0.697754 1.699219 2.017578 2.017578 1.307617 0.803711 1.022461 2.000000 1.722656 -1.699219 -0.013007
4 14410 1441 3 1 0 0 1914 1 11613 1 3 4 2016 -1 -1 -1 -1 0 0 0 2.880859 2.880859 4.0 2.0 2.0 1.0 1.0 0.0 2.0 0.757324 1.21875 1.307617 0.562500 0.697754 0.697754 0.961426 0.757324 0.757324 1.307617 0.803711 1.022461 1.857422 0.969238 0.038574 1.022326
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
85367 14320 1432 2 0 0 0 1941 2 11617 3 2 5 2016 -1 -1 -1 -1 0 0 0 2.980469 5.960938 4.0 3.0 4.0 1.0 2.0 1.0 0.0 0.799805 1.21875 1.307617 1.626953 2.037109 2.037109 0.581055 0.799805 0.799805 1.307617 1.938477 2.470703 2.285156 0.589355 1.418945 2.012868
85368 14330 1433 2 0 0 0 1941 0 11617 3 2 5 2016 -1 -1 -1 -1 0 0 0 2.480469 0.000000 0.0 0.0 3.0 1.0 3.0 0.0 0.0 0.431396 1.21875 1.307617 1.626953 2.037109 2.037109 0.709473 0.431396 0.431396 1.307617 1.938477 2.470703 0.856934 0.719727 -0.709473 -0.003395
85369 14340 1434 2 0 0 0 1941 1 11617 3 2 5 2016 -1 -1 -1 -1 0 0 0 3.980469 3.980469 0.0 3.0 0.0 2.0 1.0 1.0 2.0 0.706055 1.21875 1.307617 1.626953 2.037109 2.037109 0.950195 0.706055 0.706055 1.307617 1.938477 2.470703 1.142578 0.963867 0.049805 0.999600
85370 14350 1435 2 0 0 0 1941 1 11617 3 2 5 2016 -1 -1 -1 -1 0 0 0 1.280273 1.280273 2.0 1.0 0.0 1.0 3.0 1.0 0.0 0.638672 1.21875 1.307617 1.626953 2.037109 2.037109 0.911621 0.638672 0.638672 1.307617 1.938477 2.470703 1.571289 0.924805 0.088379 0.997703
85371 14360 1436 2 0 0 0 1941 5 11617 3 2 5 2016 -1 -1 -1 -1 0 0 0 1.000000 5.000000 10.0 2.0 7.0 3.0 0.0 6.0 5.0 0.618164 1.21875 1.307617 1.626953 2.037109 2.037109 0.857422 0.618164 0.618164 1.307617 1.938477 2.470703 4.714844 0.869629 4.140625 4.884073

85372 rows × 45 columns

In [32]:
yhat_test = yhat_test.rename(columns={0:'p_sold'})
In [37]:
percentile_dict = get_percentile_dict('p_sold', 'id', yhat_test)
percentile_dict
Out[37]:
{0: 11110,
 99: 7020,
 10: 28080,
 20: 2990,
 30: 5550,
 40: 4890,
 50: 18880,
 60: 17860,
 70: 22180,
 80: 19110,
 90: 11420}

Calculate ICE curve values

ICE values represent a model's prediction for a single row of data as an input variable of interest is varied across its domain. The grid values are chosen to match those at which partial dependence was calculated earlier, and ICE is calculated for the five most important variables and for rows at selected percentiles (the minimum, the deciles, and the maximum) of the predicted values in the validation dataset.
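Formally, with the same notation as the partial dependence formula above, the ICE value for a single row $i$ is $\widehat{ICE}_i(x_s = v) = \hat{f}\big(v,\, x_C^{(i)}\big)$, so the partial dependence curve is just the average of the individual ICE curves. In the code below each curve is obtained by calling par_dep on the validation rows belonging to one id, so it is effectively a per-item curve averaged over that item's 28 validation days.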

In [41]:
# retrieve bins from the original partial dependence calculation

bins_rolling_sold_mean = list(par_dep_rolling_sold_mean['rolling_sold_mean'])
bins_selling_trend = list(par_dep_selling_trend['selling_trend'])
bins_sell_price = list(par_dep_sell_price['sell_price'])
bins_revenue = list(par_dep_revenue['revenue'])
bins_store_item_sold_avg = list(par_dep_store_item_sold_avg['store_item_sold_avg'])
# for each percentile in percentile_dict
# create a new column in the par_dep frame 
# representing the ICE curve for that percentile
# and the variables of interest
for i in sorted(percentile_dict.keys()):
    
    col_name = 'Percentile_' + str(i)
    
    # ICE curves for rolling_sold_mean across percentiles at bins_rolling_sold_mean intervals
    par_dep_rolling_sold_mean[col_name] = par_dep('rolling_sold_mean', 
                                    validshap0[validshap0['id'] == int(percentile_dict[i])][X],  
                                    xgb_model, 
                                    bins=bins_rolling_sold_mean)['partial_dependence']
    
    # ICE curves for selling_trend across percentiles at bins_selling_trend intervals
    par_dep_selling_trend[col_name] = par_dep('selling_trend', 
                                    validshap0[validshap0['id'] == int(percentile_dict[i])][X],  
                                    xgb_model, 
                                    bins=bins_selling_trend)['partial_dependence']
    

    # ICE curves for sell_price across percentiles at bins_sell_price intervals
    par_dep_sell_price[col_name] = par_dep('sell_price', 
                                    validshap0[validshap0['id'] == int(percentile_dict[i])][X],  
                                    xgb_model, 
                                    bins=bins_sell_price)['partial_dependence']
    
    # ICE curves for revenue across percentiles at bins_revenue intervals
    par_dep_revenue[col_name] = par_dep('revenue', 
                                    validshap0[validshap0['id'] == int(percentile_dict[i])][X],  
                                    xgb_model, 
                                    bins=bins_revenue)['partial_dependence']
    
    # ICE curves for store_item_sold_avg across percentiles at bins_store_item_sold_avg intervals
    par_dep_store_item_sold_avg[col_name] = par_dep('store_item_sold_avg', 
                                    validshap0[validshap0['id'] == int(percentile_dict[i])][X],  
                                    xgb_model, 
                                    bins=bins_store_item_sold_avg)['partial_dependence']
In [42]:
par_dep_rolling_sold_mean
Out[42]:
rolling_sold_mean partial_dependence Percentile_0 Percentile_10 Percentile_20 Percentile_30 Percentile_40 Percentile_50 Percentile_60 Percentile_70 Percentile_80 Percentile_90 Percentile_99
0 0.000000 1.405705 11.391554 0.057142 0.401738 0.470152 1.781317 0.825988 0.563978 0.495526 0.686853 3.131929 36.260864
1 3.628125 1.631906 11.659171 0.314811 0.633321 0.709924 1.950554 1.108614 0.747877 0.758909 1.025778 3.320696 37.040863
2 7.256250 2.442842 12.282961 2.001419 2.023170 2.071343 2.345043 1.635603 1.205043 1.825219 1.504886 3.524072 40.117382
3 10.884375 2.547370 12.489581 1.608717 1.855129 1.876040 2.467632 1.800873 1.166088 1.962291 1.654724 3.872616 51.139702
4 14.512500 5.146945 13.233131 6.360030 6.602073 6.539042 4.061515 4.970151 3.218238 6.843673 2.130698 4.321817 58.572639
5 18.140625 5.146945 13.233131 6.360030 6.602073 6.539042 4.061515 4.970151 3.218238 6.843673 2.130698 4.321817 58.572639
6 21.768750 5.146945 13.233131 6.360030 6.602073 6.539042 4.061515 4.970151 3.218238 6.843673 2.130698 4.321817 58.572639
7 25.396875 5.146945 13.233131 6.360030 6.602073 6.539042 4.061515 4.970151 3.218238 6.843673 2.130698 4.321817 58.572639
8 29.025000 5.146945 13.233131 6.360030 6.602073 6.539042 4.061515 4.970151 3.218238 6.843673 2.130698 4.321817 58.572639
9 32.653125 5.146945 13.233131 6.360030 6.602073 6.539042 4.061515 4.970151 3.218238 6.843673 2.130698 4.321817 58.572639
10 36.281250 5.146945 13.233131 6.360030 6.602073 6.539042 4.061515 4.970151 3.218238 6.843673 2.130698 4.321817 58.572639
11 39.909375 5.146945 13.233131 6.360030 6.602073 6.539042 4.061515 4.970151 3.218238 6.843673 2.130698 4.321817 58.572639
12 43.537500 5.146945 13.233131 6.360030 6.602073 6.539042 4.061515 4.970151 3.218238 6.843673 2.130698 4.321817 58.572639
13 47.165625 5.146945 13.233131 6.360030 6.602073 6.539042 4.061515 4.970151 3.218238 6.843673 2.130698 4.321817 58.572639
14 50.793750 5.146945 13.233131 6.360030 6.602073 6.539042 4.061515 4.970151 3.218238 6.843673 2.130698 4.321817 58.572639
15 54.421875 5.146945 13.233131 6.360030 6.602073 6.539042 4.061515 4.970151 3.218238 6.843673 2.130698 4.321817 58.572639
16 58.050000 5.146945 13.233131 6.360030 6.602073 6.539042 4.061515 4.970151 3.218238 6.843673 2.130698 4.321817 58.572639
17 61.678125 5.146945 13.233131 6.360030 6.602073 6.539042 4.061515 4.970151 3.218238 6.843673 2.130698 4.321817 58.572639
18 65.306250 5.146945 13.233131 6.360030 6.602073 6.539042 4.061515 4.970151 3.218238 6.843673 2.130698 4.321817 58.572639
19 68.934375 5.146945 13.233131 6.360030 6.602073 6.539042 4.061515 4.970151 3.218238 6.843673 2.130698 4.321817 58.572639
In [43]:
par_dep_selling_trend
Out[43]:
selling_trend partial_dependence Percentile_0 Percentile_10 Percentile_20 Percentile_30 Percentile_40 Percentile_50 Percentile_60 Percentile_70 Percentile_80 Percentile_90 Percentile_99
0 -35.625 0.862477 7.454772 0.003423 0.231637 0.258000 0.878173 0.408587 0.377102 0.196645 0.114285 1.739872 42.219135
1 -29.750 0.862477 7.454772 0.003423 0.231637 0.258000 0.878173 0.408587 0.377102 0.196645 0.114285 1.739872 42.219135
2 -23.875 0.862477 7.454772 0.003423 0.231637 0.258000 0.878173 0.408587 0.377102 0.196645 0.114285 1.739872 42.219135
3 -18.000 0.862477 7.454772 0.003423 0.231637 0.258000 0.878173 0.408587 0.377102 0.196645 0.114285 1.739872 42.219135
4 -12.125 0.862477 7.454772 0.003423 0.231637 0.258000 0.878173 0.408587 0.377102 0.196645 0.114285 1.739872 42.219135
5 -6.250 0.862477 7.454772 0.003423 0.231637 0.258000 0.878173 0.408587 0.377102 0.196645 0.114285 1.739872 42.219135
6 -0.375 1.166900 9.963648 0.053204 0.299928 0.326535 0.989732 0.528553 0.482595 0.308253 0.588557 1.889368 61.547375
7 5.500 4.870893 13.395994 3.879870 3.856616 3.898650 4.657331 4.349719 3.919146 4.197729 4.610061 5.265319 66.423927
8 11.375 7.259646 17.070074 5.811790 6.233037 6.293153 6.795502 7.596638 6.260042 6.121205 9.054568 7.741347 84.044258
9 17.250 7.259646 17.070074 5.811790 6.233037 6.293153 6.795502 7.596638 6.260042 6.121205 9.054568 7.741347 84.044258
10 23.125 7.259646 17.070074 5.811790 6.233037 6.293153 6.795502 7.596638 6.260042 6.121205 9.054568 7.741347 84.044258
11 29.000 7.259646 17.070074 5.811790 6.233037 6.293153 6.795502 7.596638 6.260042 6.121205 9.054568 7.741347 84.044258
12 34.875 7.259646 17.070074 5.811790 6.233037 6.293153 6.795502 7.596638 6.260042 6.121205 9.054568 7.741347 84.044258
13 40.750 7.259646 17.070074 5.811790 6.233037 6.293153 6.795502 7.596638 6.260042 6.121205 9.054568 7.741347 84.044258
14 46.625 7.259646 17.070074 5.811790 6.233037 6.293153 6.795502 7.596638 6.260042 6.121205 9.054568 7.741347 84.044258
15 52.500 7.259646 17.070074 5.811790 6.233037 6.293153 6.795502 7.596638 6.260042 6.121205 9.054568 7.741347 84.044258
16 58.375 7.259646 17.070074 5.811790 6.233037 6.293153 6.795502 7.596638 6.260042 6.121205 9.054568 7.741347 84.044258
17 64.250 7.259646 17.070074 5.811790 6.233037 6.293153 6.795502 7.596638 6.260042 6.121205 9.054568 7.741347 84.044258
18 70.125 7.259646 17.070074 5.811790 6.233037 6.293153 6.795502 7.596638 6.260042 6.121205 9.054568 7.741347 84.044258
19 76.000 7.259646 17.070074 5.811790 6.233037 6.293153 6.795502 7.596638 6.260042 6.121205 9.054568 7.741347 84.044258
In [44]:
par_dep_sell_price
Out[44]:
sell_price partial_dependence Percentile_0 Percentile_10 Percentile_20 Percentile_30 Percentile_40 Percentile_50 Percentile_60 Percentile_70 Percentile_80 Percentile_90 Percentile_99
0 0.229980 5.315265 43.613556 0.403793 1.552037 1.864749 7.028635 1.440884 2.201071 4.512729 0.831660 9.136844 111.916443
1 1.716699 1.945405 16.537024 0.141835 0.534474 0.596123 2.185543 0.789938 0.776355 1.400958 0.650315 3.309042 58.472782
2 3.203418 1.503798 11.981726 0.112626 0.432062 0.506477 1.841766 0.740183 0.640459 0.965370 0.642328 3.025286 41.732750
3 4.690137 1.412638 11.465116 0.110067 0.423893 0.496964 1.761299 0.738179 0.631728 0.802770 0.642065 2.966386 41.669006
4 6.176855 1.346705 11.309155 0.070059 0.410402 0.486401 1.728100 0.734262 0.621008 0.633945 0.648709 2.882254 41.286022
5 7.663574 1.334537 11.522042 0.070059 0.408927 0.485556 1.714664 0.734262 0.619534 0.590534 0.648709 2.867996 41.184170
6 9.150293 1.328165 11.522042 0.070059 0.408741 0.482789 1.709030 0.734262 0.619348 0.561044 0.648709 2.867004 41.184170
7 10.637012 1.321059 11.513778 0.070059 0.408629 0.482283 1.703569 0.733674 0.619297 0.546430 0.648709 2.862278 41.184170
8 12.123730 1.314348 11.513191 0.069205 0.406776 0.479625 1.689795 0.730046 0.612487 0.538434 0.644400 2.849201 41.184170
9 13.610449 1.312560 11.513191 0.069205 0.406776 0.479625 1.687923 0.730046 0.612487 0.533754 0.644400 2.848265 41.184170
10 15.097168 1.305304 11.513191 0.065259 0.402364 0.472470 1.686831 0.727153 0.599095 0.517723 0.644400 2.845014 41.184170
11 16.583887 1.305304 11.513191 0.065259 0.402364 0.472470 1.686831 0.727153 0.599095 0.517723 0.644400 2.845014 41.184170
12 18.070605 1.305304 11.513191 0.065259 0.402364 0.472470 1.686831 0.727153 0.599095 0.517723 0.644400 2.845014 41.184170
13 19.557324 1.305304 11.513191 0.065259 0.402364 0.472470 1.686831 0.727153 0.599095 0.517723 0.644400 2.845014 41.184170
14 21.044043 1.305304 11.513191 0.065259 0.402364 0.472470 1.686831 0.727153 0.599095 0.517723 0.644400 2.845014 41.184170
15 22.530762 1.305304 11.513191 0.065259 0.402364 0.472470 1.686831 0.727153 0.599095 0.517723 0.644400 2.845014 41.184170
16 24.017480 1.305304 11.513191 0.065259 0.402364 0.472470 1.686831 0.727153 0.599095 0.517723 0.644400 2.845014 41.184170
17 25.504199 1.305304 11.513191 0.065259 0.402364 0.472470 1.686831 0.727153 0.599095 0.517723 0.644400 2.845014 41.184170
18 26.990918 1.305304 11.513191 0.065259 0.402364 0.472470 1.686831 0.727153 0.599095 0.517723 0.644400 2.845014 41.184170
19 28.477637 1.305304 11.513191 0.065259 0.402364 0.472470 1.686831 0.727153 0.599095 0.517723 0.644400 2.845014 41.184170
In [45]:
par_dep_revenue
Out[45]:
revenue partial_dependence Percentile_0 Percentile_10 Percentile_20 Percentile_30 Percentile_40 Percentile_50 Percentile_60 Percentile_70 Percentile_80 Percentile_90 Percentile_99
0 0.000000 1.044398 5.525473 0.068136 0.428681 0.477435 1.558239 0.804745 0.597089 0.515512 0.798664 2.693827 20.474428
1 28.388672 4.384423 13.280248 1.489213 3.606967 3.993641 5.060600 8.479890 3.699967 1.257946 10.618081 7.547360 39.182301
2 56.777344 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
3 85.166016 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
4 113.554688 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
5 141.943359 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
6 170.332031 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
7 198.720703 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
8 227.109375 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
9 255.498047 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
10 283.886719 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
11 312.275391 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
12 340.664062 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
13 369.052734 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
14 397.441406 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
15 425.830078 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
16 454.218750 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
17 482.607422 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
18 510.996094 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
19 539.384766 6.234501 16.419436 1.673550 6.152186 6.468204 8.067790 14.239102 6.196856 1.478832 15.874047 9.949030 58.572639
In [46]:
par_dep_store_item_sold_avg
Out[46]:
store_item_sold_avg partial_dependence Percentile_0 Percentile_10 Percentile_20 Percentile_30 Percentile_40 Percentile_50 Percentile_60 Percentile_70 Percentile_80 Percentile_90 Percentile_99
0 0.011711 1.319814 15.007802 0.067998 0.395295 0.486484 1.698783 0.867582 0.599492 0.468625 0.615485 2.891977 47.024731
1 3.292961 2.250013 11.251832 0.746247 1.273253 1.334335 2.896886 1.726239 1.500481 1.657945 1.234944 4.754498 45.473457
2 6.574211 3.037568 12.174562 1.292391 1.871261 1.909622 3.777044 2.326695 2.166357 2.592297 1.927503 5.907976 48.288494
3 9.855461 3.412627 13.107552 1.418930 2.242983 2.201995 4.437369 2.516432 2.421859 2.896825 2.058312 6.758017 51.848751
4 13.136711 4.277071 15.412866 2.239002 3.183923 2.949689 5.586397 3.100653 2.932425 3.779788 2.539493 8.351839 56.516205
5 16.417961 4.277071 15.412866 2.239002 3.183923 2.949689 5.586397 3.100653 2.932425 3.779788 2.539493 8.351839 56.516205
6 19.699211 4.277071 15.412866 2.239002 3.183923 2.949689 5.586397 3.100653 2.932425 3.779788 2.539493 8.351839 56.516205
7 22.980461 4.277071 15.412866 2.239002 3.183923 2.949689 5.586397 3.100653 2.932425 3.779788 2.539493 8.351839 56.516205
8 26.261711 4.277071 15.412866 2.239002 3.183923 2.949689 5.586397 3.100653 2.932425 3.779788 2.539493 8.351839 56.516205
9 29.542961 4.277071 15.412866 2.239002 3.183923 2.949689 5.586397 3.100653 2.932425 3.779788 2.539493 8.351839 56.516205
10 32.824211 4.277071 15.412866 2.239002 3.183923 2.949689 5.586397 3.100653 2.932425 3.779788 2.539493 8.351839 56.516205
11 36.105461 4.277071 15.412866 2.239002 3.183923 2.949689 5.586397 3.100653 2.932425 3.779788 2.539493 8.351839 56.516205
12 39.386711 4.277071 15.412866 2.239002 3.183923 2.949689 5.586397 3.100653 2.932425 3.779788 2.539493 8.351839 56.516205
13 42.667961 4.277071 15.412866 2.239002 3.183923 2.949689 5.586397 3.100653 2.932425 3.779788 2.539493 8.351839 56.516205
14 45.949211 4.277071 15.412866 2.239002 3.183923 2.949689 5.586397 3.100653 2.932425 3.779788 2.539493 8.351839 56.516205
15 49.230461 4.277071 15.412866 2.239002 3.183923 2.949689 5.586397 3.100653 2.932425 3.779788 2.539493 8.351839 56.516205
16 52.511711 4.277071 15.412866 2.239002 3.183923 2.949689 5.586397 3.100653 2.932425 3.779788 2.539493 8.351839 56.516205
17 55.792961 4.277071 15.412866 2.239002 3.183923 2.949689 5.586397 3.100653 2.932425 3.779788 2.539493 8.351839 56.516205
18 59.074211 4.277071 15.412866 2.239002 3.183923 2.949689 5.586397 3.100653 2.932425 3.779788 2.539493 8.351839 56.516205
19 62.355461 4.277071 15.412866 2.239002 3.183923 2.949689 5.586397 3.100653 2.932425 3.779788 2.539493 8.351839 56.516205

Plotting partial dependence and ICE

In [47]:
def plot_par_dep_ICE(xs, par_dep_frame):

    
    """ Plots ICE overlayed onto partial dependence for a single variable.
    
    Args: 
        xs: Name of variable for which to plot ICE and partial dependence.
        par_dep_frame: Name of Pandas DataFrame containing ICE and partial
                       dependence values.
    
    """
    
    # initialize figure and axis
    fig, ax = plt.subplots()
    
    # plot ICE curves
    par_dep_frame.drop('partial_dependence', axis=1).plot(x=xs, 
                                                          colormap='gnuplot',
                                                          ax=ax)

    # overlay partial dependence, annotate plot
    par_dep_frame.plot(title='Partial Dependence and ICE for ' + str(xs),
                       x=xs, 
                       y='partial_dependence',
                       style='r-', 
                       linewidth=3, 
                       ax=ax)

    # add legend
    _ = plt.legend(bbox_to_anchor=(1.05, 0),
                   loc=3, 
                   borderaxespad=0.)
In [48]:
plot_par_dep_ICE('rolling_sold_mean', par_dep_rolling_sold_mean) # plot partial dependence and ICE for rolling_sold_mean
In [49]:
plot_par_dep_ICE('selling_trend', par_dep_selling_trend) # plot partial dependence and ICE for selling trend
In [50]:
plot_par_dep_ICE('revenue', par_dep_revenue) # plot partial dependence and ICE for revenue
In [51]:
plot_par_dep_ICE('sell_price', par_dep_sell_price) # plot partial dependence and ICE for sell price
In [52]:
plot_par_dep_ICE('store_item_sold_avg', par_dep_store_item_sold_avg) # plot partial dependence and ICE for store item sold average